{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "f043313e-8922-464a-b2c0-828d468258ae",
   "metadata": {},
   "source": [
    "# Lesson 2: Hallucinations\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "28d77077-61e6-474d-8482-3089dbf8aa0b",
   "metadata": {
    "height": 29
   },
   "outputs": [],
   "source": [
    "import helpers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1f0606c8-7f82-486a-b836-925de36c7255",
   "metadata": {
    "height": 29
   },
   "outputs": [],
   "source": [
    "import evaluate"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b22efd43-98b3-487d-ae84-8f5fb9a0c819",
   "metadata": {
    "height": 29
   },
   "outputs": [],
   "source": [
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "053d94de-81ff-4255-8a89-4a0604a441cb",
   "metadata": {
    "height": 29
   },
   "outputs": [],
   "source": [
    "pd.set_option('display.max_colwidth', None)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e19a4b72-198c-42e2-972e-7778ee6a0beb",
   "metadata": {
    "height": 29
   },
   "outputs": [],
   "source": [
    "chats = pd.read_csv(\"./chats.csv\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c7f49369-07db-4450-9d2d-7dc97fe4e8d4",
   "metadata": {},
   "source": [
    "## Prompt-response relevance\n",
    "\n",
    "### 1. BLEU score"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5fafacc0-1752-4bf7-b6a2-e2cfffdfe681",
   "metadata": {
    "height": 29
   },
   "outputs": [],
   "source": [
    "bleu = evaluate.load(\"bleu\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "12bf829b-3599-4bcc-ad18-02c11af9419d",
   "metadata": {
    "height": 29
   },
   "outputs": [],
   "source": [
    "chats[5:6]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "19b85437-6ce4-48c4-8ea8-72b3a6ebef77",
   "metadata": {
    "height": 63
   },
   "outputs": [],
   "source": [
    "bleu.compute(predictions=[chats.loc[2, \"response\"]], \n",
    "             references=[chats.loc[2, \"prompt\"]], \n",
    "             max_order=2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "58100b87-d95b-42fe-9658-eb2ca856b8a7",
   "metadata": {
    "height": 46
   },
   "outputs": [],
   "source": [
    "from whylogs.experimental.core.udf_schema import register_dataset_udf"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "652ac6aa-00a9-4e8c-a22f-00a309be29c2",
   "metadata": {
    "height": 267
   },
   "outputs": [],
   "source": [
    "@register_dataset_udf([\"prompt\", \"response\"], \n",
    "                      \"response.bleu_score_to_prompt\")\n",
    "\n",
    "\n",
    "def bleu_score(text):\n",
    "  scores = []\n",
    "  for x, y in zip(text[\"prompt\"], text[\"response\"]):\n",
    "    scores.append(\n",
    "      bleu.compute(\n",
    "        predictions=[x], \n",
    "        references=[y], \n",
    "        max_order=2\n",
    "      )[\"bleu\"]\n",
    "    )\n",
    "  return scores"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "00e28486",
   "metadata": {},
   "source": [
    "**Note**: To view the next visual, you may have to either hide the left-side menu bar or widen the notebook towards the right."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4460de8d-98ee-4ef4-a023-da14d3fa5d0f",
   "metadata": {
    "height": 80
   },
   "outputs": [],
   "source": [
    "helpers.visualize_langkit_metric(\n",
    "    chats, \n",
    "    \"response.bleu_score_to_prompt\", \n",
    "    numeric=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "eb296e08-6f8e-4057-8c2e-d34dd6553fd7",
   "metadata": {
    "height": 80
   },
   "outputs": [],
   "source": [
    "helpers.show_langkit_critical_queries(\n",
    "    chats, \n",
    "    \"response.bleu_score_to_prompt\", \n",
    "    ascending=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f9339396-5136-4664-8589-ab3bb2d98e88",
   "metadata": {},
   "source": [
    "## 2. BERT score"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7e252f8c-caf2-4816-9e18-4d57415314da",
   "metadata": {
    "height": 29
   },
   "outputs": [],
   "source": [
    "bertscore = evaluate.load(\"bertscore\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fb642df2-4862-47ac-87b0-3b0f02e2d40b",
   "metadata": {
    "height": 80
   },
   "outputs": [],
   "source": [
    "bertscore.compute(\n",
    "    predictions=[chats.loc[2, \"prompt\"]],\n",
    "    references=[chats.loc[2, \"response\"]],\n",
    "    model_type=\"distilbert-base-uncased\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "13eb2ee2-6b06-4bf5-a93f-c0b7a9fa60bd",
   "metadata": {
    "height": 148
   },
   "outputs": [],
   "source": [
    "@register_dataset_udf([\"prompt\", \"response\"], \"response.bert_score_to_prompt\")\n",
    "def bert_score(text):\n",
    "  return bertscore.compute(\n",
    "      predictions=text[\"prompt\"].to_numpy(),\n",
    "      references=text[\"response\"].to_numpy(),\n",
    "      model_type=\"distilbert-base-uncased\"\n",
    "    )[\"f1\"]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "fce4bd71",
   "metadata": {},
   "source": [
    "**Note**: To view the next visual, you may have to either hide the left-side menu bar or widen the notebook towards the right."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0fb9a04c-bbdf-4ad6-8673-2d4161f81076",
   "metadata": {
    "height": 80
   },
   "outputs": [],
   "source": [
    "helpers.visualize_langkit_metric(\n",
    "    chats, \n",
    "    \"response.bert_score_to_prompt\", \n",
    "    numeric=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4f952cf6-17e8-40a8-a4ea-5cbb27a96f76",
   "metadata": {
    "height": 80
   },
   "outputs": [],
   "source": [
    "helpers.show_langkit_critical_queries(\n",
    "    chats, \n",
    "    \"response.bert_score_to_prompt\", \n",
    "    ascending=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7f0ea7d2-a5f2-4498-bfca-6a756c1eb5f2",
   "metadata": {
    "height": 29
   },
   "outputs": [],
   "source": [
    "from whylogs.experimental.core.udf_schema import udf_schema"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e6d60f85-9695-424a-90c7-fd0274f620e9",
   "metadata": {
    "height": 29
   },
   "outputs": [],
   "source": [
    "annotated_chats, _ = udf_schema().apply_udfs(chats)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "6e39ed2a",
   "metadata": {},
   "source": [
    "**Note**: To view the next visuals, you may have to either hide the left-side menu bar or widen the notebook towards the right."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "722c1f2d-0a2d-4e25-bb06-2637f31a744d",
   "metadata": {
    "height": 80
   },
   "outputs": [],
   "source": [
    "helpers.evaluate_examples(\n",
    "  annotated_chats[annotated_chats[\"response.bert_score_to_prompt\"] <= 0.75],\n",
    "  scope=\"hallucination\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "dcc42f45-4bbc-4f9b-b334-f47ad8807566",
   "metadata": {
    "height": 80
   },
   "outputs": [],
   "source": [
    "helpers.evaluate_examples(\n",
    "  annotated_chats[annotated_chats[\"response.bert_score_to_prompt\"] <= 0.6],\n",
    "  scope=\"hallucination\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ace4c5ad-3dcf-48cd-8e2d-554d91dbcea0",
   "metadata": {},
   "source": [
    "## Response self-similarity"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "09207610-80bf-4a69-8842-906baa6d735e",
   "metadata": {
    "height": 29
   },
   "outputs": [],
   "source": [
    "chats_extended = pd.read_csv(\"./chats_extended.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "846d1d2a-d86c-49a5-a08f-60455b53d593",
   "metadata": {
    "height": 29
   },
   "outputs": [],
   "source": [
    "chats_extended.head(5)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "5d56e1e6-0100-498f-8e3b-4262f1e66ae2",
   "metadata": {},
   "source": [
    "## 1. Sentence embedding cosine distance"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "78c633db-7c24-4dd5-af5c-1d9ada188b51",
   "metadata": {
    "height": 29
   },
   "outputs": [],
   "source": [
    "from sentence_transformers import SentenceTransformer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "39574cb3-2058-4596-bd36-b3ac46a815bf",
   "metadata": {
    "height": 29
   },
   "outputs": [],
   "source": [
    "model = SentenceTransformer('all-MiniLM-L6-v2')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "141397cb-df47-4e2a-ac39-2c19eee826b4",
   "metadata": {
    "height": 29
   },
   "outputs": [],
   "source": [
    "model.encode(\"This is a sentence to encode.\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2e4959cc-bf76-4079-addd-7311c976c670",
   "metadata": {
    "height": 29
   },
   "outputs": [],
   "source": [
    "from sentence_transformers.util import pairwise_cos_sim"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b1ecf393-0371-4a8a-942d-e09a8ab0e3e4",
   "metadata": {
    "height": 284
   },
   "outputs": [],
   "source": [
    "@register_dataset_udf([\"response\", \"response2\", \"response3\"], \n",
    "                      \"response.sentence_embedding_selfsimilarity\")\n",
    "def sentence_embedding_selfsimilarity(text):\n",
    "  response_embeddings = model.encode(text[\"response\"].to_numpy())\n",
    "  response2_embeddings = model.encode(text[\"response2\"].to_numpy())\n",
    "  response3_embeddings = model.encode(text[\"response3\"].to_numpy())\n",
    "  \n",
    "  cos_sim_with_response2 = pairwise_cos_sim(\n",
    "    response_embeddings, response2_embeddings\n",
    "    )\n",
    "  cos_sim_with_response3  = pairwise_cos_sim(\n",
    "    response_embeddings, response3_embeddings\n",
    "    )\n",
    "  \n",
    "  return (cos_sim_with_response2 + cos_sim_with_response3) / 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0e313e28-30cc-46ff-9fad-1f6fbc70d166",
   "metadata": {
    "height": 29
   },
   "outputs": [],
   "source": [
    "sentence_embedding_selfsimilarity(chats_extended)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "fb711834",
   "metadata": {},
   "source": [
    "**Note**: To view the next visual, you may have to either hide the left-side menu bar or widen the notebook towards the right."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a76dd526-11c1-4e7b-9b36-a99d7b358647",
   "metadata": {
    "height": 80
   },
   "outputs": [],
   "source": [
    "helpers.visualize_langkit_metric(\n",
    "    chats_extended, \n",
    "    \"response.sentence_embedding_selfsimilarity\", \n",
    "    numeric=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "10883b33-ef71-4f21-bdc1-5fee4a158c23",
   "metadata": {
    "height": 80
   },
   "outputs": [],
   "source": [
    "helpers.show_langkit_critical_queries(\n",
    "    chats_extended, \n",
    "    \"response.sentence_embedding_selfsimilarity\", \n",
    "    ascending=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "30334c2b-5ad2-4cdf-9e48-67cd32966872",
   "metadata": {
    "height": 29
   },
   "outputs": [],
   "source": [
    "annotated_chats, _ = udf_schema().apply_udfs(chats_extended)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a0ae38dc-cf41-47c5-943e-0f988bf0e3ce",
   "metadata": {
    "height": 29
   },
   "outputs": [],
   "source": [
    "annotated_chats.head(5)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f0575af0-edc3-4f75-9cdd-453f644b21bb",
   "metadata": {},
   "source": [
    "## 2. LLM self-evaluation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e26e04fc-0c00-4380-8d85-342083ac7667",
   "metadata": {
    "height": 29
   },
   "outputs": [],
   "source": [
    "import openai"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "121376c1-b670-4c21-bedc-53558fe8c4eb",
   "metadata": {
    "height": 29
   },
   "outputs": [],
   "source": [
    "import helpers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "92fe2111-52dc-42f1-bf7e-21d942a2414c",
   "metadata": {
    "height": 46
   },
   "outputs": [],
   "source": [
    "openai.api_key = helpers.get_openai_key()\n",
    "openai.base_url = helpers.get_openai_base_url()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ed6520f8-aee4-4917-8831-fa013785b926",
   "metadata": {
    "height": 301
   },
   "outputs": [],
   "source": [
    "def prompt_single_llm_selfsimilarity(dataset, index):\n",
    "    return openai.ChatCompletion.create(\n",
    "        model=\"gpt-3.5-turbo\",\n",
    "        messages=[{\n",
    "            \"role\": \"system\",\n",
    "            \"content\": f\"\"\"You will be provided with a text passage \\\n",
    "            and your task is to rate the consistency of that text to \\\n",
    "            that of the provided context. Your answer must be only \\\n",
    "            a number between 0.0 and 1.0 rounded to the nearest two \\\n",
    "            decimal places where 0.0 represents no consistency and \\\n",
    "            1.0 represents perfect consistency and similarity. \\n\\n \\\n",
    "            Text passage: {dataset['response'][index]}. \\n\\n \\\n",
    "            Context: {dataset['response2'][index]} \\n\\n \\\n",
    "            {dataset['response3'][index]}.\"\"\"\n",
    "        }]\n",
    "    )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1c6a9b6d-ba6f-40cf-9ec1-dee841bbde83",
   "metadata": {
    "height": 29
   },
   "outputs": [],
   "source": [
    "prompt_single_llm_selfsimilarity(chats_extended, 0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ee28ca0f-c326-43da-99d0-9d4ae50f82b0",
   "metadata": {
    "height": 63
   },
   "outputs": [],
   "source": [
    "chats_extended[\n",
    "chats_extended[\"response.prompted_selfsimilarity\"] <= 0.8\n",
    "]"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
